**************************
***	RUN USING STATA 16 ***
**************************

* Purpose: combine irt parameters, generate student abilities, calculate scores, output distribution, calculate noise estimate
* Last updated: 20 Apr 2023

*---------------------------------------------------------------------------------------
* using item parameters
*---------------------------------------------------------------------------------------

	use "$output\mat_obj_par.dta", clear
	
    keep item disc diff
	ren disc a
	ren diff b1

	tempfile o
	save `o'
	
	use "$output\mat_sub_par.dta", clear	

		append using `o'
		
		//update values for constructed response categories
		forvalues k = 1/7 {
			replace v`k' = v`k'*2
		}

		order item, first
		
		drop options		
		egen options = rownonmiss(b1- b7)
			
		//create year indicator
		gen yr = substr(item,3,1)
		destring yr, replace
		replace yr = yr+2010
			
		*remove TIMSS items
		gen length=length(item)
		drop if length==6
		drop length
		
		//simulate students
		drop x
		sort item
		gen x=_n
		
		set seed 60420
		forvalues s = 1/1000 {
		gen s_`s' = rnormal(1.2,1)
		gen st_`s' = s_`s' if x==1
		egen ability_`s' = mean(st_`s')
		drop s_* st_*
		}
		
		forvalues t = 1/1000 {
			
			gen n1_`t' = exp(a*(ability_`t'-b1))
			gen n2_`t' = exp(a*((2*ability_`t')-b1-b2))
			gen n3_`t' = exp(a*((3*ability_`t')-b1-b2-b3))
			gen n4_`t' = exp(a*((4*ability_`t')-b1-b2-b3-b4))
			gen n5_`t' = exp(a*((5*ability_`t')-b1-b2-b3-b4-b5))
			gen n6_`t' = exp(a*((6*ability_`t')-b1-b2-b3-b4-b5-b6))
			gen n7_`t' = exp(a*((7*ability_`t')-b1-b2-b3-b4-b5-b6-b7))
			
			gen d_`t' = 1 + n1_`t' + n2_`t' if options==2
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' if options==3
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' + n4_`t' if options==4
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' + n4_`t' + n5_`t' if options==5
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' + n4_`t' + n5_`t' + n6_`t' if options==6
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' + n4_`t' + n5_`t' + n6_`t' + n7_`t' if options==7
		
			gen p1_`t' = n1_`t'/d_`t'
			gen p2_`t' = n2_`t'/d_`t'
			gen p3_`t' = n3_`t'/d_`t'
			gen p4_`t' = n4_`t'/d_`t'
			gen p5_`t' = n5_`t'/d_`t'
			gen p6_`t' = n6_`t'/d_`t'
			gen p7_`t' = n7_`t'/d_`t'
		
			gen exp_`t' = v1*p1_`t'
			replace exp_`t' = exp_`t' + v2*p2_`t' 	if p2_`t'!=.
			replace exp_`t' = exp_`t' + v3*p3_`t' 	if p3_`t'!=.
			replace exp_`t' = exp_`t' + v4*p4_`t' 	if p4_`t'!=.
			replace exp_`t' = exp_`t' + v5*p5_`t' 	if p5_`t'!=.
			replace exp_`t' = exp_`t' + v6*p6_`t' 	if p6_`t'!=.
			replace exp_`t' = exp_`t' + v7*p7_`t' 	if p7_`t'!=.
		}
		
		
		forvalues t = 1/1000 {
		gen p0_`t' = (exp(a*(ability_`t'-b1)))/(1+exp(a*(ability_`t'-b1))) if options==1
		}
		
		forvalues u = 1/1000 {
		egen scr_s_`u' = sum(exp_`u'), by(yr)
		}
		
		forvalues u = 1/1000 {
		egen scr_o_`u' = sum(p0_`u'), by(yr)
		}
		
		forvalues u = 1/1000 {
		gen tot`u' = scr_s_`u' + scr_o_`u' 
		}
		
		
	**calculate total marks on offer for each year - accounts for omitted objective and subjective items
		gen type = substr(item, 1, 1)
		gen location = substr(item, 4,2)
		destring location, replace
		gen marks = 1
		replace marks = 8 if type=="s" & location<=5
		replace marks = 12 if type== "s" & location>5

		egen marks_yr = sum(marks), by(yr)
		
		**these marks_yr already incorporate missing items from objective tests
		**make manual adjustment for seleced subjective items which are not currently scored from 8 or 12
		replace marks_yr = marks_yr - 9 if yr==2011
		replace marks_yr = marks_yr - 8 if yr==2012
		replace marks_yr = marks_yr - 5 if yr==2015
		replace marks_yr = marks_yr - 2 if yr==2016
		replace marks_yr = marks_yr - 11 if yr==2018 /*NOTE: for 2018, item s_808 is omitted from analysis already, so 12 points already deducted*/
	
		save "$output\mat_score_estimates.dta", replace
			
*-------------------------------------------------------------------------------
*** generate estimate of noise used in Section V. Estimate a within-candidate correlation of test scores
*-------------------------------------------------------------------------------
		preserve
		
	***	keep one observation by year, and scores for 1000 candidates
		duplicates drop yr, force
		keep yr tot* marks_yr
		order yr marks_yr
		drop if yr==2017 // omit 2017, as we have already omitted it in calculating overall variance
		
	*** reshape to return candidateXyear	
		reshape long tot, i(yr) j(t)
		gen pc = tot/marks_yr
		drop marks_yr	

	*** pick two random scores for each candidate
	*** create random number for selection
		set seed 123456
		gen rand = invnorm(runiform())
		
	*** take min and max random numbers for the sake of it
		egen v1 = min(rand), by(t)
		egen v2 = max(rand), by(t)
		gen v = 1
		replace v = 2 if v2==rand
		keep if v1 == rand | v2 == rand
		keep t pc v
		
	*** rehsape to original setup
		reshape wide pc, i(t) j(v)
		
	*** look at score correlation
		correl pc1 pc2 // == 0.8351
		
		restore
		
*-------------------------------------------------------------------------------
* generate pass rate estimates
*-------------------------------------------------------------------------------

*** keep one observation by year and look at distributions
		egen pick = tag(yr)
		keep if pick==1
		keep yr tot* marks_yr ability*
		
		//generate shares reaching certain thresholds	
		reshape long tot ability_, i(yr) j(t)
		
		drop t //remove unnecessary
		sort yr ability_ 
		bysort yr : gen n = _n //define rank order
		gen pct = tot/marks_yr //gen percent score
		
*** set test difficulty by pooling across years (omitting 2017 anomaly)
		gen g = .
		replace g = pct if yr==2011 & n==231
		replace g = pct if yr==2012 & n==185
		replace g = pct if yr==2013 & n==281
		replace g = pct if yr==2014 & n==319
		replace g = pct if yr==2015 & n==457
		replace g = pct if yr==2016 & n==381
		replace g = pct if yr==2018 & n==317
		replace g = pct if yr==2019 & n==135
		egen jp_99 = mean(g)

		gen h = .
		replace h = pct if yr==2011 & n==558
		replace h = pct if yr==2012 & n==498
		replace h = pct if yr==2013 & n==633
		replace h = pct if yr==2014 & n==672
		replace h = pct if yr==2015 & n==756
		replace h = pct if yr==2016 & n==664
		replace h = pct if yr==2018 & n==619
		replace h = pct if yr==2019 & n==347
		egen jc_99 = mean(h)

		gen c_99 = (pct>=jc_99) //identify credit passes
		gen p_99 = (pct>=jp_99 & c_99!=1) //identify passes that are not with credit
		
		collapse (sum) c_* p_*, by(yr)
		reshape long c_ p_, i(yr) j(cal)
		
		replace cal = cal+2000
		replace c_ = c_/10
		replace p_ = p_/10
	
		save "$output\mat_pass_estimates.dta", replace